# Required packages:
require(tidyverse)
require(tidymodels)
require(broom.mixed)
require(sandwich)
require(lmeresampler)
require(lmtest)
require(sf)

# Version used:
# broom.mixed_0.2.9.4
# sandwich_3.0-2  
# lmeresampler_0.2.2
# lmtest_0.9-40 
# sf_1.0-9

#### 1 - Calculation of robust SE for the differnt models ####
get_model_df <- function(models) {
  
  if (class(models)=="lm") {
    
    coef <- coeftest(models,
                     vcov = vcovHC,
                     type = "HC3",
                     cluster= ~id)
    
    tidy_model <- tidy(coef, conf.int = T, conf.level = .95)
    tidy_model_2 <- tidy(coef, conf.int = T, conf.level = .9)
    
    tidy_model$conf.9.high <- tidy_model_2$conf.high
    tidy_model$conf.9.low <- tidy_model_2$conf.low
    
    glanced_model <- glance(models) %>%
      select(nobs,
             r.squared,
             AIC,
             BIC) %>%
      pivot_longer(cols = everything(),
                   names_to = "term",
                   values_to = "estimate")
    tidy_model$conf.low
    tidy_model <- bind_rows(tidy_model, glanced_model)
    
    number <- ifelse(length(models$coefficient) < 16, 1, 2)
    number <- ifelse(grepl("interval",paste(as.character(models$call), collapse = "")), 3, number)
    
    tidy_model$id <- paste0(number, ": ", "Linear (OLS)")
    
  }
  
  if (class(models)=="Sarlm") {
    
    projection <- lm(models$tary ~ models$tarX - 1)
    names(projection$coefficients) <- names(models$coefficients)
    
    coef <- coeftest(projection,
                     vcov = vcovHC,
                     type = "HC3")
    
    tidy_model <- tidy(coef, conf.int = T, conf.level = .95)
    tidy_model_2 <- tidy(coef, conf.int = T, conf.level = .9)
    
    tidy_model$conf.9.high <- tidy_model_2$conf.high
    tidy_model$conf.9.low <- tidy_model_2$conf.low
    
    glanced_model <- glance(models) %>%
      select(nobs,
             r.squared,
             AIC,
             BIC)  %>%
      pivot_longer(cols = everything(),
                   names_to = "term",
                   values_to = "estimate")
    
    tidy_model <- bind_rows(tidy_model, glanced_model)
    
    number <- ifelse(length(models$coefficient) < 16, 4, 5)
    number <- ifelse(grepl("interval",paste(as.character(models$call), collapse = "")), 6, number)
    
    tidy_model$id <- paste0 <- paste0(number, ": ", "SAR (MLE)")
    
  }
  
  if (class(models)=="lmerModLmerTest") {
    
    set.seed(1234)
    bootstraped_se <-
      lmeresampler::bootstrap(
        model = models,
        .f = fixef,
        type = "wild",
        B = 1000,
        hccme = "hc3",
        aux.dist = "mammen"
      )
    
    tidy_model <- tidy(models, conf.int=T, conf.level = .95) %>% 
      filter(effect == "fixed")
    
    confint_boot_99 <- confint(bootstraped_se, level=.99) %>% 
      filter(type=="norm")
    
    confint_boot_95 <- confint(bootstraped_se, level=.95) %>% 
      filter(type=="norm")
    
    confint_boot_90 <- confint(bootstraped_se, level=.9) %>% 
      filter(type=="norm")
    
    tidy_model$conf.low <- confint_boot_95$lower
    tidy_model$conf.high <- confint_boot_95$upper
    
    tidy_model$conf.9.high <- confint_boot_90$upper
    tidy_model$conf.9.low <- confint_boot_90$lower
    
    tidy_model$conf.99.high <- confint_boot_99$upper
    tidy_model$conf.99.low <- confint_boot_99$lower
    
    tidy_model$std.error <- bootstraped_se$stat$se
    
    glanced_model <- glance(models) %>%
      select(AIC,
             BIC,
             nobs) %>%
      pivot_longer(cols = everything(),
                   names_to = "term",
                   values_to = "estimate")
    
    r2 <- unname(performance::r2_nakagawa(models)[[1]])
    r2 <- data.frame(term="r.squared", estimate = r2)
    
    glanced_model <- bind_rows(glanced_model, r2)
    
    tidy_model <- bind_rows(tidy_model, glanced_model)
    
    number <- ifelse(ncol(models@frame) < 12, 7, 8)
    number <- ifelse(grepl("interval",paste(as.character(models@call), collapse = "")), 9, number)
    
    tidy_model$id <-  paste0(number, ": ", "LME (MLE)")
    
  }
  
  # Name coefficeints ----
  tidy_model$term[tidy_model$term == "time"] <- "Distance (hours)"
  tidy_model$term[tidy_model$term == "risk_factorhigh"] <- "Risk (high)"
  tidy_model$term[tidy_model$term == "risk_factormedium"] <- "Risk (medium)"
  tidy_model$term[tidy_model$term == "pressure"] <- "Workload"
  tidy_model$term[tidy_model$term == "agent1"] <- "Agent"
  tidy_model$term[tidy_model$term == "agencies1"] <- "Agencies"
  tidy_model$term[tidy_model$term == "sector_11"] <- "Automotive"
  tidy_model$term[tidy_model$term == "sector_21"] <- "Electrical Engineering"
  tidy_model$term[tidy_model$term == "sector_31"] <- "Wood processing"
  
  tidy_model$term[tidy_model$term == "year2015"] <- "Year: 2015"
  tidy_model$term[tidy_model$term == "year2016"] <- "Year: 2016"
  tidy_model$term[tidy_model$term == "year2017"] <- "Year: 2017"
  tidy_model$term[tidy_model$term == "year2018"] <- "Year: 2018"
  tidy_model$term[tidy_model$term == "year2019"] <- "Year: 2019"
  tidy_model$term[tidy_model$term == "year2020"] <- "Year: 2020"
  
  tidy_model$term[tidy_model$term == "duplicate10"] <- "Duplicate visits (10km)"
  tidy_model$term[tidy_model$term == "duplicate20"] <- "Duplicate visits (20km)"
  tidy_model$term[tidy_model$term == "duplicate30"] <- "Duplicate visits (30km)"
  tidy_model$term[tidy_model$term == "duplicate40"] <- "Duplicate visits (40km)"
  
  tidy_model$term[tidy_model$term == "r.squared"] <- "(pseudo-)R2"
  tidy_model$term[tidy_model$term == "nobs"] <- "N"
  
  return(tidy_model)
}

#### 2 - Function for the creation of main regression tables ####

get_table <- function(models_df, file_name, full_table) {
  table <- models_df
  
  table$stars <- ""
  
  table$stars[(table$p.value) < 0.1 &
                is.na(table$conf.99.high)] <- "*"
  table$stars[(table$p.value) <  0.05 &
                is.na(table$conf.99.high)] <- "**"
  table$stars[(table$p.value) < 0.01 &
                is.na(table$conf.99.high)] <- "***"
  
  # for the mixed-effect models we check significance based on the bootstrap CI 
  
  table$stars[sign(table$conf.9.high) == sign(table$conf.9.low) &
                !is.na(table$conf.99.high)] <- "*"
  table$stars[sign(table$conf.high) == sign(table$conf.low) &
                !is.na(table$conf.99.high)] <- "**"
  table$stars[sign(table$conf.99.high) == sign(table$conf.99.low) &
                !is.na(table$conf.99.high)] <- "***"
  
  table$estimate <- ifelse(
    !is.na(table$std.error), 
    paste0(format(round(table$estimate, 2), nsamll = 2,trim=T),
           table$stars,
           "|",
           "(",
           format(round(table$std.error, 2), nsamll = 2, trim=T),
           ")"), format(round(table$estimate,2),nsmall=2, trim=T))
  
  table$estimate[table$term == "N"] <- 1497
  
  table <- table %>% 
    select(id,estimate,term) %>% 
    pivot_wider(names_from = id, values_from = estimate) %>% 
    tidyr::separate_rows(everything(), sep = "\\|") 
  
  if (all(grepl("Duplicate",table$term) == FALSE & full_table == F)) {
  
  main_table <-
    rbind(
      c(
        "DV",
        "Deviation",
        "Deviation",
        "Interval",
        "Deviation",
        "Deviation",
        "Interval",
        "Deviation",
        "Deviation",
        "Interval"
      ),
      table[1:8, ],
      table[13:18, ],
      c("Sector dummies", "✗", "✓", "✓", "✗", "✓", "✓", "✗", "✓","✓"),
      c("Yearly dummies", "✗", "✓", "✓", "✗", "✓", "✓", "✗", "✗","✗"),
      c("RE: Plant", "", "", "", "", "", "", "✓", "✓","✓"),
      c("RE: District:qtr.", "", "", "", "", "", "", "✓", "✓","✓"),
      table[9,],
      c("N (plants)", "", "", "", "", "", "", 801, 801,801),
      c("N (district:qtr.)", "", "", "", "", "", "", 102, 102,102),
      table[10:12,])
  
  main_table$term[seq(3,15,2)] <- ""
  
  main_table %>% 
    flextable() %>% 
    hline(i = 1, j = -1, border = NULL, part = "body") %>% 
    hline(i = 17, j = -1, border = NULL, part = "body") %>% 
    hline(i = 19, j = -1, border = NULL, part = "body") %>% 
    add_footer_lines(" *** p < 0.01; ** p < 0.05; * p < 0.1.
  Robust standard errors in parentheses (HC3), clustered (plant level) for models 1-3
  Nagelkerke pseudeo-R2 for models 4-6, Nakagawa pseudo-R2 (conditional) for models 7-9
  Abbreviations:
  LME = Linear mixed-effect model
  MLE = Maximum likelihood estimation
  OLS = Ordinary least squares
  SAR = Spatial autoregressive model") %>% 
    fontsize(size=7, part="all") %>% 
    width(j = -1, 1.6, unit = "cm") %>% 
    width(j = 1, 2.3, unit = "cm") %>% 
    align(
      j = -1,
      align = "center",
      part = "all") %>% 
    save_as_docx(path=(file_name))
  }
  
  if (any(grepl("Duplicate",table$term) == TRUE)) {
    
    main_table <-
      rbind(
        c(
          "",
          "Deviation",
          "Deviation",
          "Interval",
          "Deviation",
          "Deviation",
          "Interval",
          "Deviation",
          "Deviation",
          "Interval"
        ),
        table[1:10, ],
        table[15:20, ],
        c("Sector dummies", "✗", "✓", "✓", "✗", "✓", "✓", "✗", "✓","✓"),
        c("Yearly dummies", "✗", "✓", "✓", "✗", "✓", "✓", "✗", "✗","✗"),
        c("RE: Plant", "", "", "", "", "", "", "✓", "✓","✓"),
        c("RE: District:qtr.", "", "", "", "", "", "", "✓", "✓","✓"),
        table[11,],
        c("N (plants)", "", "", "", "", "", "", 801, 801,801),
        c("N (district:qtr.)", "", "", "", "", "", "", 102, 102,102),
        table[12:14,])
    
    main_table$term[seq(3,17,2)] <- ""
    
    main_table %>% 
      flextable() %>% 
      hline(i = 1, j = -1, border = NULL, part = "body") %>% 
      hline(i = 17, j = -1, border = NULL, part = "body") %>% 
      hline(i = 21, j = -1, border = NULL, part = "body") %>% 
      add_footer_lines(" *** p < 0.01; ** p < 0.05; * p < 0.1.
  Robust standard errors in parentheses (HC3), clustered (plant level) for models 1-3
  Nagelkerke pseudeo-R2 for models 4-6, Nakagawa pseudo-R2 (conditional) for models 7-9
  Abbreviations:
  LME = Linear mixed-effect model
  MLE = Maximum likelihood estimation
  OLS = Ordinary least squares
  SAR = Spatial autoregressive model") %>% 
      fontsize(size=7, part="all") %>% 
      width(j = -1, 1.6, unit = "cm") %>% 
      width(j = 1, 2.3, unit = "cm") %>% 
      align(
        j = -1,
        align = "center",
        part = "all") %>% 
      save_as_docx(path=(file_name))
  }
  
  if (full_table == TRUE) {
    
    main_table <-
      rbind(
        c(
          "",
          "Deviation",
          "Deviation",
          "Interval",
          "Deviation",
          "Deviation",
          "Interval",
          "Deviation",
          "Deviation",
          "Interval"
        ),
        table[1:8, ],
        table[13:36, ],
        c("RE: Plant", "", "", "", "", "", "", "✓", "✓","✓"),
        c("RE: District:qtr.", "", "", "", "", "", "", "✓", "✓","✓"),
        table[9,],
        c("N (plants)", "", "", "", "", "", "", 801, 801,801),
        c("N (district:qtr.)", "", "", "", "", "", "", 102, 102,102),
        table[10:12,])
    
    main_table$term[seq(3,34,2)] <- ""
    
    main_table %>% 
      flextable() %>% 
      hline(i = 1, j = -1, border = NULL, part = "body") %>% 
      hline(i = 34, j = -1, border = NULL, part = "body") %>% 
      hline(i = 36, j = -1, border = NULL, part = "body") %>% 
      add_footer_lines(" *** p < 0.01; ** p < 0.05; * p < 0.1.
  Robust standard errors in parentheses (HC3), clustered (plant level) for models 1-3
  Nagelkerke pseudeo-R2 for models 4-6, Nakagawa pseudo-R2 (conditional) for models 7-9
  Abbreviations:
  LME = Linear mixed-effect model
  MLE = Maximum likelihood estimation
  OLS = Ordinary least squares
  SAR = Spatial autoregressive model") %>% 
      fontsize(size=7, part="all") %>% 
      height(height=0.5, unit = "cm") %>% 
      width(j = -1, 1.6, unit = "cm") %>% 
      width(j = 1, 2.3, unit = "cm") %>% 
      align(
        j = -1,
        align = "center",
        part = "all") %>% 
      save_as_docx(path=(file_name))
  }
    
  
}


#### 3 - Function for duplicate analysis #####

get_duplciates <- function(df, distance_threshold){
  # Calculate distance matrix
  mbase <- df %>%
    st_distance()
  
  # Set threshold
  m <- distance_threshold
  units(m) <- ("m")
  
  # convert to binary based on threshold
  mbase <- ifelse(mbase < m, 1, 0)
  
  # extract dates
  date_vector <-  df$date
  
  # calculate date difference from 2010-01-01
  days_s2010 <- difftime(date_vector, as.Date("01/01/10", "%m/%d/%y"))
  
  # Calculate distance matrix based on dates
  dist_days <- as.matrix(dist(days_s2010, diag = TRUE, upper = TRUE))
  
  # Mark same dates NOTE: Same dates should share the same distance from the starting date
  dist_days <- ifelse(dist_days == 0, 1, 0)
  
  # Multiply distance matrix with date difference matrix
  mat_fin <- mbase * dist_days
  
  # Set diagonal vector to zero
  diag(mat_fin) <- 0
  
  
  # Calculate row sums, each observation with a rowsum higher than 1 can be counted as duplicate
  duplicate <- rowSums(mat_fin)
  duplicate <- ifelse(duplicate > 0, 1, 0)
  return(duplicate)
}


